The large quantity of data uploaded to the Internet daily presents numerous challenges in information retrieval and analysis. A major challenge in automated text analysis is the identification and disambiguation of acronyms. Acronyms, such as HTML and GAN, are abbreviations formed from the first letters of a series of words. Although commonly used, acronyms are often ambiguous: a single acronym can map to many possible definitions. According to Liu et al. [1], almost 81% of acronyms used in MEDLINE abstracts are ambiguous, and we expect this figure to be even higher across the Internet at large. An automatic tool for identifying the appropriate definition of an acronym is therefore essential. To address this problem, we are designing a machine-learning-based classifier that matches ambiguous acronyms to their correct definitions based on context.
from urllib import urlopen
import re
import csv
import os
from collections import defaultdict, Counter
import operator
import random
from dbFunctions import AcronymDatabase
from sklearn.feature_extraction import text, DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import tree, metrics, svm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
The first step in our project is preprocessing the input data. The algorithm accepts HTML pages as input: the URLs are loaded from an external file, and the raw text is extracted and cleaned. Two sets of input URLs are loaded into the model:
#Load in csv data (contains list of HTML urls)
def loadHTMLData():
    urls = []
    with open('data/data.csv', 'rU') as data:
        reader = csv.reader(data, dialect=csv.excel_tab)
        for row in reader:
            fields = row[0].split(',')
            if len(fields[1]) > 0:
                urls.append(fields[1])
    return urls
def loadDuplicateData():
    train = []
    test = []
    with open('data/duplicatedata.csv', 'rU') as data:
        reader = csv.reader(data, dialect=csv.excel_tab)
        count = 0
        for row in reader:
            fields = row[0].split(',')
            #the first url always joins the training set; the second alternates
            #between training and testing on successive rows
            if len(fields[1]) > 0:
                train.append(fields[2])
            if count % 2 == 0 and len(fields[1]) > 0:
                train.append(fields[3])
            elif count % 2 == 1 and len(fields[1]) > 0:
                test.append(fields[3])
            count += 1
    return (train, test)
urls = loadHTMLData()
trainingUrlsDuplicates, testingUrlsDuplicates = loadDuplicateData()
trainingUrls = trainingUrlsDuplicates + urls[:int(0.7*len(urls))]
testingUrls = testingUrlsDuplicates + urls[int(0.7*len(urls)):]
print 'Size of Training Dataset: ', len(trainingUrls)
print 'Size of Testing Dataset: ', len(testingUrls)
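For reference, the column layout each loader expects is sketched below; this is inferred from the indexing in the parsing code above, since the files themselves are not shown.
#Assumed csv layout (inferred, not authoritative):
#  data/data.csv           row -> <field0>,<url>,...         (index 1 holds the url)
#  data/duplicatedata.csv  row -> <field0>,<flag>,<url1>,<url2>
#    url1 (index 2) always joins the training set; url2 (index 3) alternates
#    between training and testing on successive rows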
#Adapted from NLTK package. Removes HTML markup from given string.
def clean_html(html):
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace: replace non-breaking spaces and
    # collapse runs of spaces
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return (cleaned.strip()).split()
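As a quick sanity check, clean_html can be run on a toy snippet (illustrative only, not part of the pipeline):
sample = "<html><body><p>The GAN model improved accuracy.</p></body></html>"
print clean_html(sample)
#Expected output: ['The', 'GAN', 'model', 'improved', 'accuracy.']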
#Takes the cleaned word list as input. Returns list of (acronym, index) pairs
def identifyAcronyms(rawText):
    acronyms = []
    #words commonly misidentified as acronyms are manually blacklisted
    blacklist = ['ABSTRACT', 'INTRODUCTION', 'CONCLUSION', 'CONCLUSIONS', 'ACKNOWLEDGEMENTS', 'RESULTS']
    for i in range(1, len(rawText)-1):
        word = rawText[i]
        word = re.sub(r'[^\w\s]', '', word)
        '''
        characteristics of an acronym: all capital letters, length > 2,
        contains only alphabet characters, not in blacklist, and not part
        of a header (identified by determining if surrounding words are in all-caps)
        '''
        if(len(word)>2 and word[:-1].isupper() and word.isalpha() \
           and word not in blacklist and not(rawText[i-1].isupper()) \
           and not(rawText[i+1].isupper())):
            acronyms.append((word, i))
    return acronyms
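A small illustrative example of the detector's behavior on a toy token list (not part of the pipeline):
tokens = ['We', 'trained', 'a', 'GAN', 'on', 'the', 'HTML', 'corpus.']
print identifyAcronyms(tokens)
#Expected output: [('GAN', 3), ('HTML', 6)]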
We have implemented a PostgreSQL backend for this project. The database is populated with acronyms, their surrounding context, and true definitions. Using a database allows the project to scale easily.
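The dbFunctions module itself is not shown in this notebook. Below is a minimal sketch of the interface it is assumed to expose, based on the calls made here; the table and column names, the connection parameters, and the class name AcronymDatabaseSketch are all hypothetical.
#Hypothetical sketch of the AcronymDatabase interface this notebook relies on,
#inferred from the calls below; the real dbFunctions implementation may differ.
import psycopg2

class AcronymDatabaseSketch:
    def __init__(self):
        #connection parameters are placeholders
        self.conn = psycopg2.connect(dbname='acronyms', user='postgres')
    def addAcronym(self, acronym):
        #insert a new acronym and return its generated id
        cur = self.conn.cursor()
        cur.execute("INSERT INTO acronyms (name) VALUES (%s) RETURNING id", (acronym,))
        aid = cur.fetchone()[0]
        self.conn.commit()
        return aid
    def getAcronym(self, acronym):
        #return the acronym's id, or None if it is not yet in the database
        cur = self.conn.cursor()
        cur.execute("SELECT id FROM acronyms WHERE name = %s", (acronym,))
        row = cur.fetchone()
        return row[0] if row else None
    #Remaining methods used by this notebook (signatures inferred from the calls below):
    #  addTrueDefinition(...)                        -- store a labeled definition
    #  getTrueDefinition(acronym, url)               -- return the labeled definition, or None
    #  addDefinition(definition, context, url, aid)  -- store a context/definition record
    #  getContextAcronymList()                       -- return (acronym, context, definition) tuples
    #  close()                                       -- commit and close the connection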
db = AcronymDatabase()
#The csv file definitions.csv contains true definition labels for acronyms in each url. These labels were generated
#programmatically with an algorithm that we designed (generatePredictedDefinitions.py), and then manually refined.
with open('definitions.csv', 'rU') as csvfile:
    reader = csv.reader(csvfile)
    count = 0
    for row in reader:
        if(count < 5215): #only the first 5215 rows are used
            ret = db.addTrueDefinition(row[0], row[1].lower(), row[2])
        count += 1
print 'Successfully added true definitions to database'
db.close()
db = AcronymDatabase()
for fl in trainingUrls:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    #Return the lowercased, cleaned words within 15 positions of index i,
    #with the acronym itself removed
    def findContext(acronym, i):
        startIndex = max(i-15, 0)
        endIndex = min(i+15, len(rawText)-1)
        context = []
        for word in rawText[startIndex:endIndex+1]:
            word = word.lower()
            word = "".join(re.findall("[a-zA-Z]+", word))
            if(len(word)==0 or word==acronym.lower()): continue
            context.append(word)
        return " ".join(context)
    #Populate PostgreSQL database
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        aid = db.getAcronym(acronym)
        if(aid==None):
            aid = db.addAcronym(acronym)
        context = findContext(acronym, i)
        did = db.addDefinition(true_definition, context, fl, aid)
print 'Successfully added all data to database'
db.close()
The following section presents an analysis of the input data in our training and testing sets. We use two testing datasets: a breadth set (the held-out 30% of the main URL list plus the duplicate-acronym URLs), which measures performance across a wide variety of pages, and a depth set (the duplicate-acronym URLs only), which measures how well acronyms seen during training are disambiguated in new contexts.
#Analysis of training set:
db = AcronymDatabase()
uniqueAcronyms = set()
totalOccurrences = 0
definitionsPerAcronym = defaultdict(set)
for fl in trainingUrls:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        uniqueAcronyms.add(acronym)
        totalOccurrences += 1
        definitionsPerAcronym[acronym].add(true_definition)
print '---Training Dataset: Analysis---'
print 'Number of unique acronyms: ', len(uniqueAcronyms)
print 'Total number of occurrences of acronyms in text: ', totalOccurrences
print 'Average number of occurrences of each acronym in text: ', float(totalOccurrences)/len(uniqueAcronyms)
numDefs = 0
for elem in definitionsPerAcronym:
    numDefs += len(definitionsPerAcronym[elem])
meanDefs = float(numDefs)/len(uniqueAcronyms)
print 'Average number of definitions per acronym: ', meanDefs
#population standard deviation of the per-acronym definition counts
stdev = 0
for elem in definitionsPerAcronym:
    stdev += (meanDefs - len(definitionsPerAcronym[elem]))**2
stdev = (stdev/len(definitionsPerAcronym))**0.5
print 'Standard deviation of number of definitions per acronym: ', stdev
db.close()
#Analysis of testing set (breadth):
db = AcronymDatabase()
uniqueAcronyms = set()
totalOccurrences = 0
definitionsPerAcronym = defaultdict(set)
for fl in testingUrls:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        uniqueAcronyms.add(acronym)
        totalOccurrences += 1
        definitionsPerAcronym[acronym].add(true_definition)
print '---Testing Dataset - Breadth: Analysis---'
print 'Number of unique acronyms: ', len(uniqueAcronyms)
print 'Total number of occurrences of acronyms in text: ', totalOccurrences
print 'Average number of occurrences of each acronym in text: ', float(totalOccurrences)/len(uniqueAcronyms)
numDefs = 0
for elem in definitionsPerAcronym:
    numDefs += len(definitionsPerAcronym[elem])
meanDefs = float(numDefs)/len(uniqueAcronyms)
print 'Average number of definitions per acronym: ', meanDefs
#population standard deviation of the per-acronym definition counts
stdev = 0
for elem in definitionsPerAcronym:
    stdev += (meanDefs - len(definitionsPerAcronym[elem]))**2
stdev = (stdev/len(definitionsPerAcronym))**0.5
print 'Standard deviation of number of definitions per acronym: ', stdev
db.close()
#Analysis of testing set (depth):
db = AcronymDatabase()
uniqueAcronyms = set()
totalOccurrences = 0
definitionsPerAcronym = defaultdict(set)
for fl in testingUrlsDuplicates:
    try:
        html = urlopen(fl).read()
    except:
        print fl
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        uniqueAcronyms.add(acronym)
        totalOccurrences += 1
        definitionsPerAcronym[acronym].add(true_definition)
print '---Testing Dataset - Depth: Analysis---'
print 'Number of unique acronyms: ', len(uniqueAcronyms)
print 'Total number of occurrences of acronyms in text: ', totalOccurrences
print 'Average number of occurrences of each acronym in text: ', float(totalOccurrences)/len(uniqueAcronyms)
numDefs = 0
for elem in definitionsPerAcronym:
    numDefs += len(definitionsPerAcronym[elem])
meanDefs = float(numDefs)/len(uniqueAcronyms)
print 'Average number of definitions per acronym: ', meanDefs
#population standard deviation of the per-acronym definition counts
stdev = 0
for elem in definitionsPerAcronym:
    stdev += (meanDefs - len(definitionsPerAcronym[elem]))**2
stdev = (stdev/len(definitionsPerAcronym))**0.5
print 'Standard deviation of number of definitions per acronym: ', stdev
db.close()
Context is used to create a feature vector for each acronym occurrence. The words surrounding an acronym are counted and stored as a feature vector. Stop words (common words that carry little meaning on their own, such as "the" and "an") are excluded from the feature vectors, and the acronym itself is added as a heavily weighted feature.
db = AcronymDatabase()
#Convert training data to sparse vectors
tokenize = CountVectorizer().build_tokenizer()
true_defs = []
#Build a bag-of-words dict from an (acronym, context[, definition]) tuple;
#the acronym itself is added as a heavily weighted feature
def features(cad):
    acronym = cad[0]
    context = cad[1]
    if(len(cad)==3): true_defs.append(cad[2]) #side effect: collect the training labels
    terms = tokenize(context)
    d = {acronym: 10}
    for t in terms:
        if(t not in text.ENGLISH_STOP_WORDS):
            d[t] = d.get(t, 0) + 1
    return d
#np.set_printoptions(threshold=np.nan)
cadList = db.getContextAcronymList()
vect = DictVectorizer()
X_train = vect.fit_transform(features(d) for d in cadList)
print X_train.toarray()
#print cadList[2]
#print vect.get_feature_names()
#print true_defs
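To illustrate, here is the feature dict produced for a single toy acronym/context pair (invented input, not part of the pipeline; key order may vary):
example = ('GAN', 'generative adversarial networks generate realistic images')
print features(example) #two-element tuple, so no label is appended to true_defs
#-> {'GAN': 10, 'generative': 1, 'adversarial': 1, 'networks': 1,
#    'generate': 1, 'realistic': 1, 'images': 1}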
Hyperparameters were tuned with a grid search, and four machine-learning classifiers (multinomial naive Bayes, a linear SVM, a decision tree, and a random forest) were fit to the data.
#Tune hyperparameters
parameter_candidates = [
{'C': [1],
'gamma': [0.001, 0.0001]}
]
parameters_nb = [
{'alpha': [0.01, 0.1, 0.5, 1]}
]
parameters_tree = [{"max_depth": [3, None],
"max_features": [1, 2, None],
"min_samples_leaf": [1, 2, 3],
"criterion": ["gini", "entropy"]}]
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1).fit(X_train, true_defs)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
clf = GridSearchCV(estimator=MultinomialNB(), param_grid=parameters_nb, n_jobs=-1, cv=3).fit(X_train, true_defs) #cross-validation requires at least 2 folds; use 3-fold CV
print('Best alpha:',clf.best_estimator_.alpha)
clf = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=parameters_tree, n_jobs=-1).fit(X_train, true_defs)
print('Best Max Depth:',clf.best_estimator_.max_depth)
print('Best Max Features:',clf.best_estimator_.max_features)
print('Best Min Samples:',clf.best_estimator_.min_samples_leaf)
print('Best Criterion:',clf.best_estimator_.criterion)
clf1 = MultinomialNB(alpha=0.09).fit(X_train, true_defs)
print 'Trained Model 1'
clf2 = svm.LinearSVC(C=1).fit(X_train, true_defs)
print 'Trained Model 2'
clf3 = tree.DecisionTreeClassifier(min_samples_leaf=1).fit(X_train, true_defs)
print 'Trained Model 3'
clf4 = RandomForestClassifier().fit(X_train, true_defs)
print 'Trained Model 4'
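As a quick sanity check before the aggregate evaluation, a single new (acronym, context) pair can be classified directly (toy input; the context string is invented):
newPair = ('SVM', 'we trained a support vector machine classifier on these features')
X_example = vect.transform([features(newPair)])
print clf1.predict(X_example) #predicted definition for this acronym in this context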
The accuracy of each classifier was evaluated on the training dataset and on both testing datasets.
#Calculate training accuracy
trainData = []
y_true = []
for fl in trainingUrls:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    def findContext(acronym, i):
        startIndex = max(i-15, 0)
        endIndex = min(i+15, len(rawText)-1)
        context = []
        for word in rawText[startIndex:endIndex+1]:
            word = word.lower()
            word = "".join(re.findall("[a-zA-Z]+", word))
            if(len(word)==0 or word==acronym.lower()): continue
            context.append(word)
        return " ".join(context)
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        context = findContext(acronym, i)
        trainData.append((acronym, context))
        y_true.append(true_definition)
X_new_counts = vect.transform(features(d) for d in trainData)
predicted1 = clf1.predict(X_new_counts)
predicted2 = clf2.predict(X_new_counts)
predicted3 = clf3.predict(X_new_counts)
predicted4 = clf4.predict(X_new_counts)
#for train, definition, true in zip(trainData, predicted, y_true):
# if(true!=definition): print('%s => %s, %s' % (train[0], definition, true))
#print metrics.precision_recall_fscore_support(y_true, predicted, average='weighted')
print "Prediction Accuracy - Multinomial NB: ", accuracy_score(y_true, predicted1)
print metrics.precision_recall_fscore_support(y_true, predicted1, average='weighted')
print "Prediction Accuracy - SVC: ", accuracy_score(y_true, predicted2)
print metrics.precision_recall_fscore_support(y_true, predicted2, average='weighted')
print "Prediction Accuracy - Decision Tree: ", accuracy_score(y_true, predicted3)
print metrics.precision_recall_fscore_support(y_true, predicted3, average='weighted')
print "Prediction Accuracy - Random Forest: ", accuracy_score(y_true, predicted4)
print metrics.precision_recall_fscore_support(y_true, predicted4, average='weighted')
#print(metrics.classification_report(y_true, predicted1))
#print(metrics.classification_report(y_true, predicted2))
#print(metrics.classification_report(y_true, predicted3))
#print(metrics.classification_report(y_true, predicted4))
testData = []
y_true_test = []
for fl in testingUrls:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    def findContext(acronym, i):
        startIndex = max(i-15, 0)
        endIndex = min(i+15, len(rawText)-1)
        context = []
        for word in rawText[startIndex:endIndex+1]:
            word = word.lower()
            word = "".join(re.findall("[a-zA-Z]+", word))
            if(len(word)==0 or word==acronym.lower()): continue
            context.append(word)
        return " ".join(context)
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        if(true_definition not in y_true): continue #skip definitions never seen in training
        context = findContext(acronym, i)
        testData.append((acronym, context))
        y_true_test.append(true_definition)
X_new_counts = vect.transform(features(d) for d in testData)
predicted1 = clf1.predict(X_new_counts)
predicted2 = clf2.predict(X_new_counts)
predicted3 = clf3.predict(X_new_counts)
predicted4 = clf4.predict(X_new_counts)
#for test, definition, true in zip(testData, predicted, y_true_test):
#if(definition==true): print('Correct: %s => %s, %s' % (test[0], definition, true))
# if(definition!=true): print('Error: %s => %s, %s' % (test[0], definition, true))
print "Prediction Accuracy - Multinomial NB: ", accuracy_score(y_true_test, predicted1)
print metrics.precision_recall_fscore_support(y_true_test, predicted1, average='weighted')
print "Prediction Accuracy - SVC: ", accuracy_score(y_true_test, predicted2)
print metrics.precision_recall_fscore_support(y_true_test, predicted2, average='weighted')
print "Prediction Accuracy - Decision Tree: ", accuracy_score(y_true_test, predicted3)
print metrics.precision_recall_fscore_support(y_true_test, predicted3, average='weighted')
print "Prediction Accuracy - Random Forest: ", accuracy_score(y_true_test, predicted4)
print metrics.precision_recall_fscore_support(y_true_test, predicted4, average='weighted')
#print(metrics.classification_report(y_true_test, predicted1))
#print(metrics.classification_report(y_true_test, predicted2))
#print(metrics.classification_report(y_true_test, predicted3))
#print(metrics.classification_report(y_true_test, predicted4))
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    #per-cell text annotations are omitted; they are unreadable with this many classes
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Compute confusion matrix
cnf_matrix1 = confusion_matrix(y_true_test, predicted1)
cnf_matrix2 = confusion_matrix(y_true_test, predicted2)
cnf_matrix3 = confusion_matrix(y_true_test, predicted3)
cnf_matrix4 = confusion_matrix(y_true_test, predicted4)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
#plt.figure()
class_names1 = sorted(list(set(predicted1).union(set(y_true_test))))
class_names2 = sorted(list(set(predicted2).union(set(y_true_test))))
class_names3 = sorted(list(set(predicted3).union(set(y_true_test))))
class_names4 = sorted(list(set(predicted4).union(set(y_true_test))))
#plot_confusion_matrix(cnf_matrix, classes=class_names,
# title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix1, classes=class_names1, normalize=True,
title='Normalized confusion matrix - Multinomial NB')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix2, classes=class_names2, normalize=True,
title='Normalized confusion matrix - SVC')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix3, classes=class_names3, normalize=True,
title='Normalized confusion matrix - Decision Tree')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix4, classes=class_names4, normalize=True,
title='Normalized confusion matrix - Random Forest')
plt.show()
testDataDuplicates = []
y_true_test_duplicates = []
for fl in testingUrlsDuplicates:
    try:
        html = urlopen(fl).read()
    except:
        continue
    rawText = clean_html(html)
    footerIndices = [i for i, x in enumerate(rawText) if x.lower()=='references']
    headerIndices = [i for i, x in enumerate(rawText) if x.lower()=='abstract']
    if(len(footerIndices)>0): rawText = rawText[:max(footerIndices)] #remove extraneous information
    if(len(headerIndices)>0): rawText = rawText[max(headerIndices):] #remove extraneous information
    def findContext(acronym, i):
        startIndex = max(i-15, 0)
        endIndex = min(i+15, len(rawText)-1)
        context = []
        for word in rawText[startIndex:endIndex+1]:
            word = word.lower()
            word = "".join(re.findall("[a-zA-Z]+", word))
            if(len(word)==0 or word==acronym.lower()): continue
            context.append(word)
        return " ".join(context)
    acronyms = identifyAcronyms(rawText) #list of all acronyms and corresponding index in rawtext
    for acronym, i in acronyms:
        true_definition = db.getTrueDefinition(acronym, fl)
        if(true_definition==None): continue #If definition has not been labeled, skip
        if(true_definition not in y_true): continue #skip definitions never seen in training
        context = findContext(acronym, i)
        testDataDuplicates.append((acronym, context))
        y_true_test_duplicates.append(true_definition)
X_new_counts_duplicates = vect.transform(features(d) for d in testDataDuplicates)
predicted1 = clf1.predict(X_new_counts_duplicates)
predicted2 = clf2.predict(X_new_counts_duplicates)
predicted3 = clf3.predict(X_new_counts_duplicates)
predicted4 = clf4.predict(X_new_counts_duplicates)
#for test, definition, true in zip(testDataDuplicates, predicted, y_true_test_duplicates):
#if(definition==true): print('Correct: %s => %s, %s' % (test[0], definition, true))
# if(definition!=true): print('Error: %s => %s, %s' % (test[0], definition, true))
print "Prediction Accuracy - Multinomial NB: ", accuracy_score(y_true_test_duplicates, predicted1)
print metrics.precision_recall_fscore_support(y_true_test_duplicates, predicted1, average='weighted')
print "Prediction Accuracy - SVC: ", accuracy_score(y_true_test_duplicates, predicted2)
print metrics.precision_recall_fscore_support(y_true_test_duplicates, predicted2, average='weighted')
print "Prediction Accuracy - Decision Tree: ", accuracy_score(y_true_test_duplicates, predicted3)
print metrics.precision_recall_fscore_support(y_true_test_duplicates, predicted3, average='weighted')
print "Prediction Accuracy - Random Forest: ", accuracy_score(y_true_test_duplicates, predicted4)
print metrics.precision_recall_fscore_support(y_true_test_duplicates, predicted4, average='weighted')
#print(metrics.classification_report(y_true_test_duplicates, predicted1))
#print(metrics.classification_report(y_true_test_duplicates, predicted2))
#print(metrics.classification_report(y_true_test_duplicates, predicted3))
#print(metrics.classification_report(y_true_test_duplicates, predicted4))
# Compute confusion matrix
cnf_matrix1 = confusion_matrix(y_true_test_duplicates, predicted1)
cnf_matrix2 = confusion_matrix(y_true_test_duplicates, predicted2)
cnf_matrix3 = confusion_matrix(y_true_test_duplicates, predicted3)
cnf_matrix4 = confusion_matrix(y_true_test_duplicates, predicted4)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
#plt.figure(figsize=(30,30))
class_names1 = sorted(list(set(predicted1).union(set(y_true_test_duplicates))))
class_names2 = sorted(list(set(predicted2).union(set(y_true_test_duplicates))))
class_names3 = sorted(list(set(predicted3).union(set(y_true_test_duplicates))))
class_names4 = sorted(list(set(predicted4).union(set(y_true_test_duplicates))))
#print len(class_names)
#plot_confusion_matrix(cnf_matrix, classes=class_names,
# title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix1, classes=class_names1, normalize=True,
title='Normalized confusion matrix - Multinomial NB')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix2, classes=class_names2, normalize=True,
title='Normalized confusion matrix - SVC')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix3, classes=class_names3, normalize=True,
title='Normalized confusion matrix - Decision Tree')
plt.show()
plt.figure(figsize=(30,30))
plot_confusion_matrix(cnf_matrix4, classes=class_names4, normalize=True,
title='Normalized confusion matrix - Random Forest')
plt.show()
db.close()